1   package org.apache.lucene.analysis.in;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.util.BitSet;
21  import java.util.IdentityHashMap;
22  import static java.lang.Character.UnicodeBlock.*;
23  import static org.apache.lucene.analysis.util.StemmerUtil.*;
24  
25  /**
26   * Normalizes the Unicode representation of text in Indian languages.
27   * <p>
28   * Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
29   * and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
30   * </p>
31   */
32  public class IndicNormalizer {
33    
34    private static class ScriptData {
35      final int flag;
36      final int base;
37      BitSet decompMask;
38      
39      ScriptData(int flag, int base) {
40        this.flag = flag;
41        this.base = base;
42      }
43    }
44    
45    private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts = 
46      new IdentityHashMap<>(9);
47    
48    private static int flag(Character.UnicodeBlock ub) {
49      return scripts.get(ub).flag;
50    }
51    
52    static {
53      scripts.put(DEVANAGARI, new ScriptData(1,   0x0900));
54      scripts.put(BENGALI,    new ScriptData(2,   0x0980));
55      scripts.put(GURMUKHI,   new ScriptData(4,   0x0A00));
56      scripts.put(GUJARATI,   new ScriptData(8,   0x0A80));
57      scripts.put(ORIYA,      new ScriptData(16,  0x0B00));
58      scripts.put(TAMIL,      new ScriptData(32,  0x0B80));
59      scripts.put(TELUGU,     new ScriptData(64,  0x0C00));
60      scripts.put(KANNADA,    new ScriptData(128, 0x0C80));
61      scripts.put(MALAYALAM,  new ScriptData(256, 0x0D00));
62    }
63  
64    /**
65     * Decompositions according to Unicode 5.2, 
66     * and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
67     * 
68     * Most of these are not handled by unicode normalization anyway.
69     * 
70     * The numbers here represent offsets into the respective codepages,
71     * with -1 representing null and 0xFF representing zero-width joiner.
72     * 
73     * the columns are: ch1, ch2, ch3, res, flags
74     * ch1, ch2, and ch3 are the decomposition
75     * res is the composition, and flags are the scripts to which it applies.
76     */
77    private static final int decompositions[][] = {
78        /* devanagari, gujarati vowel candra O */
79        { 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
80        /* devanagari short O */
81        { 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) }, 
82        /* devanagari, gujarati letter O */
83        { 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
84        /* devanagari letter AI, gujarati letter AU */
85        { 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) }, 
86        /* devanagari, bengali, gurmukhi, gujarati, oriya AA */
87        { 0x05, 0x3E,   -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) }, 
88        /* devanagari letter candra A */
89        { 0x05, 0x45,   -1, 0x72, flag(DEVANAGARI) },
90        /* gujarati vowel candra E */
91        { 0x05, 0x45,   -1, 0x0D, flag(GUJARATI) },
92        /* devanagari letter short A */
93        { 0x05, 0x46,   -1, 0x04, flag(DEVANAGARI) },
94        /* gujarati letter E */
95        { 0x05, 0x47,   -1, 0x0F, flag(GUJARATI) }, 
96        /* gurmukhi, gujarati letter AI */
97        { 0x05, 0x48,   -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) }, 
98        /* devanagari, gujarati vowel candra O */
99        { 0x05, 0x49,   -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) }, 
100       /* devanagari short O */
101       { 0x05, 0x4A,   -1, 0x12, flag(DEVANAGARI) }, 
102       /* devanagari, gujarati letter O */
103       { 0x05, 0x4B,   -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) }, 
104       /* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */
105       { 0x05, 0x4C,   -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) }, 
106       /* devanagari, gujarati vowel candra O */
107       { 0x06, 0x45,   -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },  
108       /* devanagari short O */
109       { 0x06, 0x46,   -1, 0x12, flag(DEVANAGARI) },
110       /* devanagari, gujarati letter O */
111       { 0x06, 0x47,   -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
112       /* devanagari letter AI, gujarati letter AU */
113       { 0x06, 0x48,   -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
114       /* malayalam letter II */
115       { 0x07, 0x57,   -1, 0x08, flag(MALAYALAM) },
116       /* devanagari letter UU */
117       { 0x09, 0x41,   -1, 0x0A, flag(DEVANAGARI) },
118       /* tamil, malayalam letter UU (some styles) */
119       { 0x09, 0x57,   -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
120       /* malayalam letter AI */
121       { 0x0E, 0x46,   -1, 0x10, flag(MALAYALAM) },
122       /* devanagari candra E */
123       { 0x0F, 0x45,   -1, 0x0D, flag(DEVANAGARI) }, 
124       /* devanagari short E */
125       { 0x0F, 0x46,   -1, 0x0E, flag(DEVANAGARI) },
126       /* devanagari AI */
127       { 0x0F, 0x47,   -1, 0x10, flag(DEVANAGARI) },
128       /* oriya AI */
129       { 0x0F, 0x57,   -1, 0x10, flag(ORIYA) },
130       /* malayalam letter OO */
131       { 0x12, 0x3E,   -1, 0x13, flag(MALAYALAM) }, 
132       /* telugu, kannada letter AU */
133       { 0x12, 0x4C,   -1, 0x14, flag(TELUGU) | flag(KANNADA) }, 
134       /* telugu letter OO */
135       { 0x12, 0x55,   -1, 0x13, flag(TELUGU) },
136       /* tamil, malayalam letter AU */
137       { 0x12, 0x57,   -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
138       /* oriya letter AU */
139       { 0x13, 0x57,   -1, 0x14, flag(ORIYA) },
140       /* devanagari qa */
141       { 0x15, 0x3C,   -1, 0x58, flag(DEVANAGARI) },
142       /* devanagari, gurmukhi khha */
143       { 0x16, 0x3C,   -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
144       /* devanagari, gurmukhi ghha */
145       { 0x17, 0x3C,   -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
146       /* devanagari, gurmukhi za */
147       { 0x1C, 0x3C,   -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
148       /* devanagari dddha, bengali, oriya rra */
149       { 0x21, 0x3C,   -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
150       /* devanagari, bengali, oriya rha */
151       { 0x22, 0x3C,   -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
152       /* malayalam chillu nn */
153       { 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
154       /* bengali khanda ta */
155       { 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
156       /* devanagari nnna */
157       { 0x28, 0x3C,   -1, 0x29, flag(DEVANAGARI) },
158       /* malayalam chillu n */
159       { 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
160       /* devanagari, gurmukhi fa */
161       { 0x2B, 0x3C,   -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
162       /* devanagari, bengali yya */
163       { 0x2F, 0x3C,   -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
164       /* telugu letter vocalic R */
165       { 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
166       /* devanagari rra */
167       { 0x30, 0x3C,   -1, 0x31, flag(DEVANAGARI) },
168       /* malayalam chillu rr */
169       { 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
170       /* malayalam chillu l */
171       { 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
172       /* devanagari llla */
173       { 0x33, 0x3C,   -1, 0x34, flag(DEVANAGARI) },
174       /* malayalam chillu ll */
175       { 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
176       /* telugu letter MA */ 
177       { 0x35, 0x41,   -1, 0x2E, flag(TELUGU) },
178       /* devanagari, gujarati vowel sign candra O */
179       { 0x3E, 0x45,   -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
180       /* devanagari vowel sign short O */
181       { 0x3E, 0x46,   -1, 0x4A, flag(DEVANAGARI) },
182       /* devanagari, gujarati vowel sign O */
183       { 0x3E, 0x47,   -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
184       /* devanagari, gujarati vowel sign AU */ 
185       { 0x3E, 0x48,   -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
186       /* kannada vowel sign II */ 
187       { 0x3F, 0x55,   -1, 0x40, flag(KANNADA) },
188       /* gurmukhi vowel sign UU (when stacking) */
189       { 0x41, 0x41,   -1, 0x42, flag(GURMUKHI) },
190       /* tamil, malayalam vowel sign O */
191       { 0x46, 0x3E,   -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
192       /* kannada vowel sign OO */
193       { 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
194       /* kannada vowel sign O */
195       { 0x46, 0x42,   -1, 0x4A, flag(KANNADA) },
196       /* malayalam vowel sign AI (if reordered twice) */
197       { 0x46, 0x46,   -1, 0x48, flag(MALAYALAM) },
198       /* telugu, kannada vowel sign EE */
199       { 0x46, 0x55,   -1, 0x47, flag(TELUGU) | flag(KANNADA) },
200       /* telugu, kannada vowel sign AI */
201       { 0x46, 0x56,   -1, 0x48, flag(TELUGU) | flag(KANNADA) },
202       /* tamil, malayalam vowel sign AU */
203       { 0x46, 0x57,   -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
204       /* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
205       { 0x47, 0x3E,   -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
206       /* bengali, oriya vowel sign AU */
207       { 0x47, 0x57,   -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
208       /* kannada vowel sign OO */   
209       { 0x4A, 0x55,   -1, 0x4B, flag(KANNADA) },
210       /* gurmukhi letter I */
211       { 0x72, 0x3F,   -1, 0x07, flag(GURMUKHI) },
212       /* gurmukhi letter II */
213       { 0x72, 0x40,   -1, 0x08, flag(GURMUKHI) },
214       /* gurmukhi letter EE */
215       { 0x72, 0x47,   -1, 0x0F, flag(GURMUKHI) },
216       /* gurmukhi letter U */
217       { 0x73, 0x41,   -1, 0x09, flag(GURMUKHI) },
218       /* gurmukhi letter UU */
219       { 0x73, 0x42,   -1, 0x0A, flag(GURMUKHI) },
220       /* gurmukhi letter OO */
221       { 0x73, 0x4B,   -1, 0x13, flag(GURMUKHI) },
222   };
223   
224   static {
225     for (ScriptData sd : scripts.values()) {
226       sd.decompMask = new BitSet(0x7F);
227       for (int i = 0; i < decompositions.length; i++) {
228         final int ch = decompositions[i][0];
229         final int flags = decompositions[i][4];
230         if ((flags & sd.flag) != 0)
231           sd.decompMask.set(ch);
232       }
233     }
234   }
235    
236   /**
237    * Normalizes input text, and returns the new length.
238    * The length will always be less than or equal to the existing length.
239    * 
240    * @param text input text
241    * @param len valid length
242    * @return normalized length
243    */
244   public int normalize(char text[], int len) {
245     for (int i = 0; i < len; i++) {
246       final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
247       final ScriptData sd = scripts.get(block);
248       if (sd != null) {
249         final int ch = text[i] - sd.base;
250         if (sd.decompMask.get(ch))
251           len = compose(ch, block, sd, text, i, len);
252       }
253     }
254     return len;
255   }
256   
257   /**
258    * Compose into standard form any compositions in the decompositions table.
259    */
260   private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd, 
261       char text[], int pos, int len) {
262     if (pos + 1 >= len) /* need at least 2 chars! */
263       return len;
264     
265     final int ch1 = text[pos + 1] - sd.base;
266     final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
267     if (block1 != block0) /* needs to be the same writing system */
268       return len;
269     
270     int ch2 = -1;
271 
272     if (pos + 2 < len) {
273       ch2 = text[pos + 2] - sd.base;
274       Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
275       if (text[pos + 2] == '\u200D') // ZWJ
276         ch2 = 0xFF;
277       else if (block2 != block1)  // still allow a 2-char match
278         ch2 = -1;
279     }
280 
281     for (int i = 0; i < decompositions.length; i++)
282       if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
283         if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
284           text[pos] = (char) (sd.base + decompositions[i][3]);
285           len = delete(text, pos + 1, len);
286           if (decompositions[i][2] >= 0)
287             len = delete(text, pos + 1, len);
288           return len;
289         }
290       }
291     
292     return len;
293   }
294 }